{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# 导入第三方模块\n",
    "from sklearn import svm\n",
    "import pandas as pd\n",
    "from sklearn import model_selection\n",
    "from sklearn import metrics"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# 读取外部数据\n",
    "letters = pd.read_csv(r'C:\\Users\\Administrator\\Desktop\\letterdata.csv')\n",
    "# 数据前5行\n",
    "letters.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# 将数据拆分为训练集和测试集\n",
    "predictors = letters.columns[1:]\n",
    "X_train,X_test,y_train,y_test = model_selection.train_test_split(letters[predictors], letters.letter, \n",
    "                                                                 test_size = 0.25, random_state = 1234)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# 使用网格搜索法，选择线性可分SVM“类”中的最佳C值\n",
    "C=[0.05,0.1,0.5,1,2,5]\n",
    "parameters = {'C':C}\n",
    "grid_linear_svc = model_selection.GridSearchCV(estimator = svm.LinearSVC(),param_grid =parameters,scoring='accuracy',cv=5,verbose =1)\n",
    "# 模型在训练数据集上的拟合\n",
    "grid_linear_svc.fit(X_train,y_train)\n",
    "# 返回交叉验证后的最佳参数值\n",
    "grid_linear_svc.best_params_, grid_linear_svc.best_score_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# 模型在测试集上的预测\n",
    "pred_ linear_svc = grid_linear_svc.predict(X_test)\n",
    "# 模型的预测准确率\n",
    "metrics.accuracy_score(y_test, pred_linear_svc)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# 使用网格搜索法，选择非线性SVM“类”中的最佳C值\n",
    "kernel=['rbf','linear','poly','sigmoid']\n",
    "C=[0.1,0.5,1,2,5]\n",
    "parameters = {'kernel':kernel,'C':C}\n",
    "grid_svc = model_selection.GridSearchCV(estimator = svm.SVC(),param_grid =parameters,scoring='accuracy',cv=5,verbose =1)\n",
    "# 模型在训练数据集上的拟合\n",
    "grid_svc.fit(X_train,y_train)\n",
    "# 返回交叉验证后的最佳参数值\n",
    "grid_svc.best_params_, grid_svc.best_score_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# 模型在测试集上的预测\n",
    "pred_svc = grid_svc.predict(X_test)\n",
    "# 模型的预测准确率\n",
    "metrics.accuracy_score(y_test,pred_svc)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# 读取外部数据\n",
    "forestfires = pd.read_csv(r'C:\\Users\\Administrator\\Desktop\\forestfires.csv')\n",
    "# 数据前5行\n",
    "forestfires.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# 删除day变量\n",
    "forestfires.drop('day',axis = 1, inplace = True)\n",
    "# 将月份作数值化处理\n",
    "forestfires.month = pd.factorize(forestfires.month)[0]\n",
    "# 预览数据前5行\n",
    "forestfires.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# 导入第三方模块\n",
    "import seaborn as sns\n",
    "import matplotlib.pyplot as plt\n",
    "from scipy.stats import norm\n",
    "# 绘制森林烧毁面积的直方图\n",
    "sns.distplot(forestfires.area, bins = 50, kde = True, fit = norm, hist_kws = {'color':'steelblue'}, \n",
    "             kde_kws = {'color':'red', 'label':'Kernel Density'}, \n",
    "             fit_kws = {'color':'black','label':'Nomal', 'linestyle':'--'})\n",
    "# 显示图例\n",
    "plt.legend()\n",
    "# 显示图形\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# 导入第三方模块\n",
    "from sklearn import preprocessing\n",
    "import numpy as np\n",
    "from sklearn import neighbors\n",
    "# 对area变量作对数变换\n",
    "y = np.log1p(forestfires.area)\n",
    "# 将X变量作标准化处理\n",
    "predictors = forestfires.columns[:-1]\n",
    "X = preprocessing.scale(forestfires[predictors])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# 将数据拆分为训练集和测试集\n",
    "X_train,X_test,y_train,y_test = model_selection.train_test_split(X, y, test_size = 0.25, random_state = 1234)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# 构建默认参数的SVM回归模型\n",
    "svr = svm.SVR()\n",
    "# 模型在训练数据集上的拟合\n",
    "svr.fit(X_train,y_train)\n",
    "# 模型在测试上的预测\n",
    "pred_svr = svr.predict(X_test)\n",
    "# 计算模型的MSE\n",
    "metrics.mean_squared_error(y_test,pred_svr)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# 使用网格搜索法，选择SVM回归中的最佳C值、epsilon值和gamma值\n",
    "epsilon = np.arange(0.1,1.5,0.2)\n",
    "C= np.arange(100,1000,200)\n",
    "gamma = np.arange(0.001,0.01,0.002)\n",
    "parameters = {'epsilon':epsilon,'C':C,'gamma':gamma}\n",
    "grid_svr = model_selection.GridSearchCV(estimator = svm.SVR(),param_grid =parameters,\n",
    "                                        scoring='neg_mean_squared_error',cv=5,verbose =1, n_jobs=2)\n",
    "# 模型在训练数据集上的拟合\n",
    "grid_svr.fit(X_train,y_train)\n",
    "# 返回交叉验证后的最佳参数值\n",
    "print(grid_svr.best_params_, grid_svr.best_score_)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# 模型在测试集上的预测\n",
    "pred_grid_svr = grid_svr.predict(X_test)\n",
    "# 计算模型在测试集上的MSE值\n",
    "metrics.mean_squared_error(y_test,pred_grid_svr)"
   ]
  }
 ],
 "metadata": {
  "anaconda-cloud": {},
  "kernelspec": {
   "display_name": "Python [Anaconda3]",
   "language": "python",
   "name": "Python [Anaconda3]"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
